Introduction + Methods

This is the 3rd time I’m trying to estimate the needed sample size for our study. At this point, you should (hopefully) know the drill…
This time we will try to estimate our needed N with another simulation approach using deviance to a “true score” which we will get with sampling a lot.

We use this sample sequence:

sample_sequence
##  [1]   20   30   40   50   60   70   80   90  100  150  200  250  500  750 1000
## [16] 1250 1500 1750 2000 2250 2500 2750 3000 5000

And for each of these sample sizes, we simulate 400 times (while higher numbers would lead to a more robust/ reliable finding, I still need to make sure that it can still be computable)…

Here are our assumed population distribution, which we will sample from again:

normal <- round(rnorm(n_population, mean = 9,sd = 1.2))
normal <- pmin(pmax(normal, min_likert), max_likert)

strong_pol <- round(rbeta(n_population, shape1 = .1, shape2 = .1)*10) +1
strong_pol <- pmin(pmax(strong_pol, min_likert), max_likert)

rare <- c(round(rnorm(n_population *(1- prop_minority), mean = 2,sd = 1.3)), 
           round(rnorm(n_population *prop_minority, mean = 11,sd = .5)))
rare <- pmin(pmax(rare, min_likert), max_likert)

normal_2 <- round(rnorm(n_population, mean = 8,sd = 1))
normal_2 <- pmin(pmax(normal_2, min_likert), max_likert)


Pop_df <- data.frame(`Normal Distribution`= normal,
                     `Normal Distribution 2` = normal_2,
                     `Rare Polarization` = rare,
                     `Strong Polarization` = strong_pol)

names(Pop_df) <- c("None", "None 2", "Rare", "Strong Pol.")



Pop_df_plot <- Pop_df %>% pivot_longer(everything(), values_to = "Rating", names_to = "Distr") %>% 
   mutate(Distr = factor(Distr, levels = c("None", "None 2","Rare", "Strong Pol.")))

Pop_df_plot %>% ggplot(aes(Rating, fill = Distr))+
   geom_bar(width= .75)+
   facet_grid(rows = vars(Distr))+
   scale_x_continuous(n.breaks = 11, expand = c(0,0))+
   theme_minimal()+
   theme(strip.text = element_blank(),
         axis.text.y = element_blank(),
         legend.position = "none",
         plot.margin = margin(1, 2, 0, 0),
         axis.text.x = element_text(size = 12))+
   geom_text(aes(x=6,y = 3500, label= Distr, color = Distr), size = 6.8)+
   scale_fill_manual(values = colors)+
   scale_color_manual(values = colors)+
   ylab("Count")

# max_cols <- max(sampled)
# 
# # Register parallel backend with the desired number of cores
# num_cores <- detectCores()-1
# 
# cl <- makeCluster(num_cores)
# registerDoParallel(cl)
# 
# #rotate Pop_df, so our function works (each row needs to be a different distribution, instead of each col)
# rot_Pop_df <- as.data.frame(t(Pop_df))
# 
# risk_dist_tr <- data.frame(risk_distribution = 1:4,
#            risk_dist_transl = rownames(rot_Pop_df))
# 
# # Define function to process each combination of risk_distribution, samplesize and replications per setting
# sample_and_replicate_for_all_risks <- function(i) {
#   sampled_matrix_list <- list()
# 
#   for (j in 1:nrow(rot_Pop_df)) {
#     mat <- replicate(replications_per_setting,
#                      sample(1:n_population, size = sampled[i, 1], replace = TRUE)) #create matrix of our samples with replacing, times n - replications
# 
#     sampled_table <- rot_Pop_df[j, mat] #using the matrix, collect the values from our risk distribution matrix (as a vector though...)
#     sampled_matrix <- as.data.frame(matrix(sampled_table,
#                                            nrow = replications_per_setting,
#                                            ncol = sampled[i, 1],
#                                            byrow = TRUE,
#                                            dimnames = NULL)) #create df out of these vectors instead of flat vectors
# 
# 
#     if (ncol(sampled_matrix) < max_cols) {
#       padding_matrix <- matrix(NA,
#                                nrow = nrow(sampled_matrix),
#                                ncol = max_cols - ncol(sampled_matrix))
#       sampled_matrix <- cbind(sampled_matrix, padding_matrix)
#     } #if matrix is not wide enough for our end result matrix, padd it with NA columns, so binding rows is doable (needs same amount of ncols)
# 
#     sampled_matrix <- cbind(matrix(sampled[i,1], nrow = replications_per_setting),
#                             matrix(j, nrow = replications_per_setting),
#                             sampled_matrix) #bind columns with additional information such as sample size and which risk_distribution was sampled
# 
#     colnames(sampled_matrix) <- c("sample_size", "risk_distribution", paste0("rating_", 1:max_cols)) #rewrite colnames so it is identical to the bigh matrix
#     sampled_matrix_list[[j]] <- sampled_matrix #store in list
#   }
#   return(do.call(rbind, sampled_matrix_list)) #after all risk distributions are sampled from, bind them all and return the output
# 
# }
# 
# # Perform parallel processing using foreach, iterating through the different samplesizes
# result <- foreach(i = 1:nrow(sampled), .combine = rbind) %dopar% {
#   sample_and_replicate_for_all_risks(i)
# }
# 
# # Stop the parallel backend
# stopCluster(cl)
# 
# result <- mutate_all(result, as.numeric)
# 
# 
# vis_miss(result[,seq(from =3, to = ncol(result), length.out =20)], show_perc_col = F)
#taken from my previous work

# calc_bimodality_coefficient <- function(vec){
#    skew <- skew(vec, na.rm = TRUE, type = 3)
#    kurtosis <- kurtosi(vec, na.rm = TRUE, type = 3)
#    n <- sum(!is.na(vec))
#    return((skew^2+1) / (kurtosis + ((3*((n-1))^2)/((n-2)*(n-3))) ))
# }
# 
# # Created this one "from scratch", as we work with a smaller scale of 11 instead of 101, the calculation of polarization does not take too long anymore
# 
# calc_polarization <- function(vec){
#    vec2 <- as.vector(vec)
#    freq_vec <- agrmt::collapse(vec2, pos = c(1:11))
#    return(agrmt::polarization(freq_vec))
# }
# 
# # this code is commented, as the computation takes too long, and I hate waiting while kniting. I have saved a sepparate rds file, which will be read in instead. Readers who want to uncomment section, select the lines to uncomment and press Ctrl + Shift + C (on Windows/Linux) or Cmd + Shift + C (on macOS)
# #
# # #apply the functions to our result matrix, therefore calculate for each drawn sample the polarization metrics
# BC_result <- apply(result[,-c(1:2)], 1, calc_bimodality_coefficient)
# sum(is.na(BC_result))
# 
# 
# # Register parallel backend with the desired number of cores
# num_cores <- detectCores()-1
# 
# cl <- makeCluster(num_cores)
# registerDoParallel(cl)
# 
# # Perform parallel processing using foreach, iterating through the different drawn samples
# polarization_result <- foreach(i = 1:nrow(result), .combine = rbind) %dopar% {
#   calc_polarization(result[i, -c(1:2)])
# }
# sum(is.na(polarization_result))
# 
# # Stop the parallel backend
# stopCluster(cl)
# 
# 
# combined_result_measures <- cbind(polarization_result,
#                                   BC_result,
#                                   result[,1:2]
#                                   )
# sum(is.na(combined_result_measures))
# 
# 
# combined_result_measures <- combined_result_measures %>%
#    left_join(risk_dist_tr, by = "risk_distribution") %>%
#    mutate(fsample_size = factor(sample_size, ordered = T))
# 
# saveRDS(combined_result_measures, "saved_RDS\\combined_result_measures_power_analysis_deviance.rds")

combined_result_measures <- readRDS("saved_RDS\\combined_result_measures_power_analysis_deviance.rds")



vis_miss(combined_result_measures, sort_miss = F)

Results

In the following density plots, we see the distribution of each calculated polarization measure (from the book “Triggerpunkte” from Mau et al.), which ranges from 0 (not polarized, people are in agreement) to 1 (highest polarization). We can see that the more we sample, the more the results converge to a “true” score for said assumed population distribution.

combined_result_measures %>% 
   ggplot(aes(polarization_result))+
   geom_density(aes(fill = fsample_size), alpha = .4)+
   facet_grid(rows =vars(risk_dist_transl))

And for the bimodality Coefficient as well:

combined_result_measures %>% 
   ggplot(aes(BC_result))+
   geom_density(aes(fill = fsample_size), alpha = .4)+
   facet_grid(rows =vars(risk_dist_transl))

In the following, we will calculate the mode of each density plot (that is, where the polarization measures converges or has a high probability that it ends up on X, if we were to sample it fewer times). We can then plot these findings, and should look at the point where the modes converges towards the “true score” (in our case, the sample size of 5000).

combined_result_measures <- combined_result_measures %>% 
   mutate(grouped_distXsample = paste(risk_dist_transl, fsample_size, sep = "_"))

splitted <- split(combined_result_measures$polarization_result, f = combined_result_measures$grouped_distXsample)
splitted2 <- split(combined_result_measures$BC_result, f = combined_result_measures$grouped_distXsample)

mode_dens_res <- data.frame()

for (i in 1:length(splitted)) {

   dens_res <- density(splitted[[i]])
   dens_res2 <- density(splitted2[[i]])
   
   dens_df <- data.frame(group = names(splitted)[i],
                         mode_dens_pol = dens_res$x[which.max(dens_res$y)],
                         mode_dens_BC = dens_res2$x[which.max(dens_res2$y)])
   
   mode_dens_res <- rbind(mode_dens_res, dens_df)
   
}

cleaned_mode_dens_res <- mode_dens_res %>% 
   separate(group, sep = "_", into = c("Distribution", "fsample_size")) %>% 
   mutate(Sample_Size = as.integer(fsample_size)) %>% 
   pivot_longer(cols = starts_with("mode_dens"),
                names_prefix = "mode_dens_",
                names_to = "measure",
                values_to = "value")
   

cleaned_mode_dens_res %>% 
   ggplot(aes(Sample_Size, value, group = Distribution, col = Distribution))+
   geom_line(size= 1.1)+
   facet_grid(rows = vars(measure))+
   scale_x_continuous(breaks = sample_sequence)+
   theme_minimal()+
   scale_color_manual(values = colors)

Probability of Sample being in True Score Interval

While the mode could a be good indicator, one might also argue that it is more relevant to have a good chance of capturing the true polarization value. That is, the probability that the measure is only off by only .1 (meaning it is always within a interval of + 0.05 and - 0.05) from the true score. So let’s calculate the % where the 400 simulations per sample size was within this interval. One could then interpret the results as: the probability of coming to the same polarization value (with an interval of .1) compared to if we had sampled 5’000 people.

Interval of .1

# calculate the "true score" value of said polarization measure for each population distribution

true_score <- combined_result_measures %>% 
   filter(sample_size == 5000) %>% 
   group_by(risk_dist_transl) %>% 
   summarize(mean_true_score_BC = mean(BC_result),
             mean_true_score_pol = mean(polarization_result))




combined_result_measures_within_interval <- combined_result_measures %>% 
   left_join(true_score, by = c("risk_dist_transl")) %>% 
   mutate(within_BC = if_else(BC_result >= mean_true_score_BC- lower_and_upper_interval_limit &
                               BC_result <= mean_true_score_BC + lower_and_upper_interval_limit, 1, 0),
          within_pol = if_else(polarization_result >= mean_true_score_pol- lower_and_upper_interval_limit &
                               polarization_result <= mean_true_score_pol + lower_and_upper_interval_limit, 1, 0))

summarized_within_interval <-
   combined_result_measures_within_interval %>%
   group_by(risk_dist_transl, sample_size) %>%
   summarize(count_within_BC = sum(within_BC),
      count_within_pol = sum(within_pol),
      perc_within_BC = sum(within_BC) / replications_per_setting * 100,
      perc_within_pol = sum(within_pol) / replications_per_setting * 100) %>%
   pivot_longer(cols = starts_with("perc"),
      names_to = "measure",
      values_to = "percentage",
      names_prefix = "perc_within_")

summarized_within_interval %>% ggplot(aes(factor(sample_size), percentage, col = risk_dist_transl, group = risk_dist_transl))+
   geom_point()+
   geom_line()+
   facet_wrap(~measure)+
   theme_minimal()+
   scale_y_continuous(breaks = seq(0, 100, by = 10))+
   labs(y = "percentage of simulations within 0.05 interval of true score",
        x = "sample size",
        title = "Interval of +- 0.05")+
   scale_color_manual(values = colors)+
   theme(legend.position = "none")

Interval of .05

As the interval of .05 (which results of a coverage of .1) was “arbitrarily” taken, let’s use the same procedure for an interval of .025 (or a span of only 0.05). In this case, one would expect that we need a bigger sample size, as the margin for “error” is smaller where we would be confident enough that we have captured the true score.

combined_result_measures_within_interval <- combined_result_measures %>% 
   left_join(true_score, by = c("risk_dist_transl")) %>% 
   mutate(within_BC = if_else(BC_result >= mean_true_score_BC- lower_and_upper_interval_limit2 &
                               BC_result <= mean_true_score_BC + lower_and_upper_interval_limit2, 1, 0),
          within_pol = if_else(polarization_result >= mean_true_score_pol- lower_and_upper_interval_limit2 &
                               polarization_result <= mean_true_score_pol + lower_and_upper_interval_limit2, 1, 0))

summarized_within_interval <-
   combined_result_measures_within_interval %>%
   group_by(risk_dist_transl, sample_size) %>%
   summarize(count_within_BC = sum(within_BC),
      count_within_pol = sum(within_pol),
      perc_within_BC = sum(within_BC) / replications_per_setting * 100,
      perc_within_pol = sum(within_pol) / replications_per_setting * 100) %>%
   pivot_longer(cols = starts_with("perc"),
      names_to = "measure",
      values_to = "percentage",
      names_prefix = "perc_within_")

summarized_within_interval %>% ggplot(aes(factor(sample_size), percentage, col = risk_dist_transl, group = risk_dist_transl))+
   geom_point()+
   geom_line()+
   facet_wrap(~measure)+
   theme_minimal()+
   scale_y_continuous(breaks = seq(0, 100, by = 10))+
   labs(y = "percentage of simulations within 0.025 interval of true score",
        x = "sample size",
        title = "Interval of +- 0.025")+
   scale_color_manual(values = colors)+
   theme(legend.position = "none")

Difference between two polarizations

The approach above would only tell us how likely it is that the sampled distribution would lead to almost the same polarization value if one had sampled 5’000 people (or in a sense, a “true score”, as it converges to said population distribution). In the following section, we will go one step further and analyse how likely it is to find the same difference between two population distributions and two sampled distributions.

true_score_long <- true_score %>% 
   pivot_longer(cols = starts_with("mean_true_score"),
                names_prefix = "mean_true_score_",
                names_to = "measure",
                values_to = "value")


true_score_expanded <- expand.grid(x = true_score$risk_dist_transl, y =true_score$risk_dist_transl) %>% 
   filter(x != y) %>% 
   distinct() %>% 
   left_join(true_score, by = c("x" = "risk_dist_transl")) %>% 
   left_join(true_score, by = c("y" = "risk_dist_transl"), suffix = c("_x", "_y")) %>% 
   mutate(true_diff_BC = mean_true_score_BC_x - mean_true_score_BC_y,
          true_diff_pol = mean_true_score_pol_x - mean_true_score_pol_y)


#because we have duplicate combinations (e.g. abs(x - y) == abs(y - x)), I'll sort them and take half of it (e.g. only the neccessary combinations)
n_unique <- nrow(true_score) * (nrow(true_score)-1) / 2

true_score_unique <- true_score_expanded %>% 
   arrange(desc(true_diff_BC)) %>% 
   slice_head(n = n_unique) %>% 
   select(x, y, true_diff_BC, true_diff_pol)



true_score_unique %>% 
   kable(digits = 2)
x y true_diff_BC true_diff_pol
Strong Pol. None 2 0.54 0.70
Strong Pol. None 0.50 0.68
Rare None 2 0.42 0.06
Rare None 0.38 0.04
Strong Pol. Rare 0.12 0.64
None None 2 0.04 0.02

These are the true score differences between two distributions if one were to average the difference from sampling 5000 ratings for each population.

In the following, we will calculate the difference between the calculated polarization measures from another distribution. Again, we will take two intervals: .1 and .05 (that is, the sampled difference is allowed to deviate by .05 or .025 from the true score on either side).

Interval of .1

combined_result_measures_selected <- combined_result_measures %>% 
   select(contains("result"), sample_size, risk_dist_transl) %>% 
   group_by(risk_dist_transl, sample_size) %>% 
   mutate(sample_ID = row_number())

diff_distributions <- true_score_unique %>% 
   left_join(combined_result_measures_selected, by = c("x" = "risk_dist_transl")) %>% 
   left_join(combined_result_measures_selected, by = c("y" = "risk_dist_transl", "sample_size", "sample_ID"), suffix = c("_x", "_y")) %>% 
   mutate(diff_BC = BC_result_x - BC_result_y,
          diff_pol = polarization_result_x - polarization_result_y)

within_interval_diff_distributions <- diff_distributions %>% 
   mutate(within_BC = if_else(true_diff_BC >= diff_BC - lower_and_upper_interval_limit &
                                 true_diff_BC <= diff_BC + lower_and_upper_interval_limit, 1, 0),
          within_pol = if_else(true_diff_pol >= diff_pol - lower_and_upper_interval_limit &
                                 true_diff_pol <= diff_pol + lower_and_upper_interval_limit, 1, 0),
          comparison = paste(x, y, sep = " - ")
   ) %>% 
   group_by(comparison, sample_size) %>% 
   summarize(perc_within_BC = sum(within_BC) / replications_per_setting * 100,
             perc_within_pol = sum(within_pol) / replications_per_setting * 100) %>% 
   pivot_longer(cols = starts_with("perc"),
                names_to = "measure",
                values_to = "percentage",
                names_prefix = "perc_within_")
   

within_interval_diff_distributions %>% ggplot(aes(factor(sample_size), percentage, col = factor(comparison), group = factor(comparison)))+
   geom_point(size = 2)+
   geom_line(linewidth = 1.2)+
   facet_wrap(~measure)+
   theme_minimal()+
   scale_y_continuous(breaks = seq(0, 100, by = 10))+
   labs(y = "percentage of simulations within 0.05 interval of true score",
        x = "sample size",
        title = "Interval of +- 0.05",
        color = "Comparison between")+
   scale_color_manual(values = colors2)+
   theme(legend.spacing = unit(0.1, "cm"),
         legend.margin = margin(0,0,0,0),
         legend.box.margin = margin(0,0,0,0),
         legend.position = c(.5,.05),
         legend.direction = "horizontal")

Interval of .05

within_interval_diff_distributions <- diff_distributions %>% 
   mutate(within_BC = if_else(true_diff_BC >= diff_BC - lower_and_upper_interval_limit2 &
                                 true_diff_BC <= diff_BC + lower_and_upper_interval_limit2, 1, 0),
          within_pol = if_else(true_diff_pol >= diff_pol - lower_and_upper_interval_limit2 &
                                 true_diff_pol <= diff_pol + lower_and_upper_interval_limit2, 1, 0),
          comparison = paste(x, y, sep = " - ")
   ) %>% 
   group_by(comparison, sample_size) %>% 
   summarize(perc_within_BC = sum(within_BC) / replications_per_setting * 100,
             perc_within_pol = sum(within_pol) / replications_per_setting * 100) %>% 
   pivot_longer(cols = starts_with("perc"),
                names_to = "measure",
                values_to = "percentage",
                names_prefix = "perc_within_")
   

within_interval_diff_distributions %>% ggplot(aes(factor(sample_size), percentage, col = factor(comparison), group = factor(comparison)))+
   geom_point(size = 2)+
   geom_line(linewidth = 1.2)+
   facet_wrap(~measure)+
   theme_minimal()+
   scale_y_continuous(breaks = seq(0, 100, by = 10))+
   labs(y = "percentage of simulations within 0.025 interval of true score",
        x = "sample size",
        title = "Interval of +- 0.025",
        color = "Comparison between")+
   scale_color_manual(values = colors2)+
   theme(legend.spacing = unit(0.1, "cm"),
         legend.margin = margin(0,0,0,0),
         legend.box.margin = margin(0,0,0,0),
         legend.position = c(.5,.05),
         legend.direction = "horizontal")

Discussion

As one has expected, the “needed” sample size increases the stricter we set the intervals which we would be satisfied with.

If one were to only look at the plots with the stricter interval:

  • On the side of bimodality coefficient, one could argue to sample around 1250 participants so that we have a good chance of around 80% to get a good estimate for rare distributions (so that in case we do sample from a rare distribution, in 80% of the time, the true bimodality coefficient is off by only a small amount). If one wants to make sure, even sampling 2000 participants would be nice to increase the chance to 90%, but maybe that is stretching the budget a little bit…

  • For the measure of polarization, estimating the true score value is hardest if one has a strongly polarized distribution. In this case, we have a 80% chance of getting it “right” if one would sample 1250 participants, or 90% for sampling 2000 respectively.

The numbers here are only suggestions, and are only based on precision, so that we can be sure that the results we get are reliable enough. In any case, these numbers are “only” worst case scenarios, as the other population distributions are converging earlier. We are also only looking at the stricter scenario…

When looking for the difference between two distributions, one need to sample 1500 or even 2750 people for 80% or 90% chance of being near the true score.

Limitations

  • Discussion was solely based on the stricter interval (e.g. the last plot), not on the lenient one with a broader interval.
  • As always, the simulation should be taken with a grain of salt, as these results are made with the assumption that the distribution is as “beautiful” as I made it in the introduction section…
  • Whether we can truly sample as random as we want to, or we get some subsamples is not of debate here. This work just assumes that the whole population has X distribution.
  • No comparison between different sample distributions were made (e.g. overlap coefficient), so we still have no idea how big the sample size needs to be if we want to discover a difference of X between two countries/ groups in case there is a difference.

Credits

R Packages Used

---
title: "Risk Polarization Power Analysis using Deviance"
author: "Andy Cao"
date: "2024-04-23"
output: 
   html_document:
      css: styles.css
      toc: true
      number_sections: false
      toc_float: true
      collapsed: true
      smooth_controll: false
      fig.width: 26
      fig.height: 26
      code_download: true
      code_folding: hide
---

# Introduction + Methods

```{r Setup, include=FALSE, warning=FALSE, error=FALSE, message=FALSE}
library(tidyverse) #data wrangling and other tools for R
library(knitr) # report generation in R
library(psych) #calculate skew and kurtosis for BC
library(agrmt) #for agreement and polarization calculation
library(visdat) #visualize dataframes in plots
library(RColorBrewer) #easy to use color palettes
library(rmarkdown) #for the paged_table function
library(doParallel) #parallel computation using multiple cores
library(foreach) # for each function, so the simulation does not take ages


lower_and_upper_interval_limit <- .05
lower_and_upper_interval_limit2 <- .025

colors <- brewer.pal(4, "Dark2")
colors2 <- brewer.pal(6, "Set2")

n_scale_on_likert <- c(1:11)
min_likert <- min(n_scale_on_likert)
max_likert <- max(n_scale_on_likert)

sample_sequence <- c(20,30,40,50, 60, 70, 80, 90, 100, 150, 200,seq(250, 3000, by = 250), 5000)
sampled <- data.frame(N_part = sample_sequence)

replications_per_setting <- 400

n_population <- 10^4
prop_minority <- .05

#set seed so every random thing here is reproducible
set.seed(42)

#set echo = FALSE (e.g. dont show code in output) for all chunks, except when explicitly telling otherwise
knitr::opts_chunk$set(
   echo = TRUE,
   warning = FALSE,
   message = FALSE
   )
```

This is the 3rd time I'm trying to estimate the needed sample size for our study. At this point, you should (hopefully) know the drill...   
This time we will try to estimate our needed N with another simulation approach using deviance to a "true score" which we will get with sampling a lot.

We use this sample sequence:  
```{r}
sample_sequence
```

And for each of these sample sizes, we simulate ``r replications_per_setting`` times (while higher numbers would lead to a more robust/ reliable finding, I still need to make sure that it can still be computable)...

Here are our assumed population distribution, which we will sample from again:  

```{r Generation of Population Distribution}

normal <- round(rnorm(n_population, mean = 9,sd = 1.2))
normal <- pmin(pmax(normal, min_likert), max_likert)

strong_pol <- round(rbeta(n_population, shape1 = .1, shape2 = .1)*10) +1
strong_pol <- pmin(pmax(strong_pol, min_likert), max_likert)

rare <- c(round(rnorm(n_population *(1- prop_minority), mean = 2,sd = 1.3)), 
           round(rnorm(n_population *prop_minority, mean = 11,sd = .5)))
rare <- pmin(pmax(rare, min_likert), max_likert)

normal_2 <- round(rnorm(n_population, mean = 8,sd = 1))
normal_2 <- pmin(pmax(normal_2, min_likert), max_likert)


Pop_df <- data.frame(`Normal Distribution`= normal,
                     `Normal Distribution 2` = normal_2,
                     `Rare Polarization` = rare,
                     `Strong Polarization` = strong_pol)

names(Pop_df) <- c("None", "None 2", "Rare", "Strong Pol.")



Pop_df_plot <- Pop_df %>% pivot_longer(everything(), values_to = "Rating", names_to = "Distr") %>% 
   mutate(Distr = factor(Distr, levels = c("None", "None 2","Rare", "Strong Pol.")))

Pop_df_plot %>% ggplot(aes(Rating, fill = Distr))+
   geom_bar(width= .75)+
   facet_grid(rows = vars(Distr))+
   scale_x_continuous(n.breaks = 11, expand = c(0,0))+
   theme_minimal()+
   theme(strip.text = element_blank(),
         axis.text.y = element_blank(),
         legend.position = "none",
         plot.margin = margin(1, 2, 0, 0),
         axis.text.x = element_text(size = 12))+
   geom_text(aes(x=6,y = 3500, label= Distr, color = Distr), size = 6.8)+
   scale_fill_manual(values = colors)+
   scale_color_manual(values = colors)+
   ylab("Count")
```

```{r Sampling, cache=TRUE}
# max_cols <- max(sampled)
# 
# # Register parallel backend with the desired number of cores
# num_cores <- detectCores()-1
# 
# cl <- makeCluster(num_cores)
# registerDoParallel(cl)
# 
# #rotate Pop_df, so our function works (each row needs to be a different distribution, instead of each col)
# rot_Pop_df <- as.data.frame(t(Pop_df))
# 
# risk_dist_tr <- data.frame(risk_distribution = 1:4,
#            risk_dist_transl = rownames(rot_Pop_df))
# 
# # Define function to process each combination of risk_distribution, samplesize and replications per setting
# sample_and_replicate_for_all_risks <- function(i) {
#   sampled_matrix_list <- list()
# 
#   for (j in 1:nrow(rot_Pop_df)) {
#     mat <- replicate(replications_per_setting,
#                      sample(1:n_population, size = sampled[i, 1], replace = TRUE)) #create matrix of our samples with replacing, times n - replications
# 
#     sampled_table <- rot_Pop_df[j, mat] #using the matrix, collect the values from our risk distribution matrix (as a vector though...)
#     sampled_matrix <- as.data.frame(matrix(sampled_table,
#                                            nrow = replications_per_setting,
#                                            ncol = sampled[i, 1],
#                                            byrow = TRUE,
#                                            dimnames = NULL)) #create df out of these vectors instead of flat vectors
# 
# 
#     if (ncol(sampled_matrix) < max_cols) {
#       padding_matrix <- matrix(NA,
#                                nrow = nrow(sampled_matrix),
#                                ncol = max_cols - ncol(sampled_matrix))
#       sampled_matrix <- cbind(sampled_matrix, padding_matrix)
#     } #if matrix is not wide enough for our end result matrix, padd it with NA columns, so binding rows is doable (needs same amount of ncols)
# 
#     sampled_matrix <- cbind(matrix(sampled[i,1], nrow = replications_per_setting),
#                             matrix(j, nrow = replications_per_setting),
#                             sampled_matrix) #bind columns with additional information such as sample size and which risk_distribution was sampled
# 
#     colnames(sampled_matrix) <- c("sample_size", "risk_distribution", paste0("rating_", 1:max_cols)) #rewrite colnames so it is identical to the bigh matrix
#     sampled_matrix_list[[j]] <- sampled_matrix #store in list
#   }
#   return(do.call(rbind, sampled_matrix_list)) #after all risk distributions are sampled from, bind them all and return the output
# 
# }
# 
# # Perform parallel processing using foreach, iterating through the different samplesizes
# result <- foreach(i = 1:nrow(sampled), .combine = rbind) %dopar% {
#   sample_and_replicate_for_all_risks(i)
# }
# 
# # Stop the parallel backend
# stopCluster(cl)
# 
# result <- mutate_all(result, as.numeric)
# 
# 
# vis_miss(result[,seq(from =3, to = ncol(result), length.out =20)], show_perc_col = F)
```  


```{r Calculating Polarisation Measures, cache=TRUE}
#taken from my previous work

# calc_bimodality_coefficient <- function(vec){
#    skew <- skew(vec, na.rm = TRUE, type = 3)
#    kurtosis <- kurtosi(vec, na.rm = TRUE, type = 3)
#    n <- sum(!is.na(vec))
#    return((skew^2+1) / (kurtosis + ((3*((n-1))^2)/((n-2)*(n-3))) ))
# }
# 
# # Created this one "from scratch", as we work with a smaller scale of 11 instead of 101, the calculation of polarization does not take too long anymore
# 
# calc_polarization <- function(vec){
#    vec2 <- as.vector(vec)
#    freq_vec <- agrmt::collapse(vec2, pos = c(1:11))
#    return(agrmt::polarization(freq_vec))
# }
# 
# # this code is commented, as the computation takes too long, and I hate waiting while kniting. I have saved a sepparate rds file, which will be read in instead. Readers who want to uncomment section, select the lines to uncomment and press Ctrl + Shift + C (on Windows/Linux) or Cmd + Shift + C (on macOS)
# #
# # #apply the functions to our result matrix, therefore calculate for each drawn sample the polarization metrics
# BC_result <- apply(result[,-c(1:2)], 1, calc_bimodality_coefficient)
# sum(is.na(BC_result))
# 
# 
# # Register parallel backend with the desired number of cores
# num_cores <- detectCores()-1
# 
# cl <- makeCluster(num_cores)
# registerDoParallel(cl)
# 
# # Perform parallel processing using foreach, iterating through the different drawn samples
# polarization_result <- foreach(i = 1:nrow(result), .combine = rbind) %dopar% {
#   calc_polarization(result[i, -c(1:2)])
# }
# sum(is.na(polarization_result))
# 
# # Stop the parallel backend
# stopCluster(cl)
# 
# 
# combined_result_measures <- cbind(polarization_result,
#                                   BC_result,
#                                   result[,1:2]
#                                   )
# sum(is.na(combined_result_measures))
# 
# 
# combined_result_measures <- combined_result_measures %>%
#    left_join(risk_dist_tr, by = "risk_distribution") %>%
#    mutate(fsample_size = factor(sample_size, ordered = T))
# 
# saveRDS(combined_result_measures, "saved_RDS\\combined_result_measures_power_analysis_deviance.rds")

combined_result_measures <- readRDS("saved_RDS\\combined_result_measures_power_analysis_deviance.rds")



vis_miss(combined_result_measures, sort_miss = F)
```

# Results

In the following density plots, we see the distribution of each calculated **polarization** measure (from the book "Triggerpunkte" from Mau et al.), which ranges from 0 (not polarized, people are in agreement) to 1 (highest polarization). We can see that the more we sample, the more the results converge to a "true" score for said assumed population distribution.

```{r}
combined_result_measures %>% 
   ggplot(aes(polarization_result))+
   geom_density(aes(fill = fsample_size), alpha = .4)+
   facet_grid(rows =vars(risk_dist_transl))
```

And for the bimodality Coefficient as well:

```{r}
combined_result_measures %>% 
   ggplot(aes(BC_result))+
   geom_density(aes(fill = fsample_size), alpha = .4)+
   facet_grid(rows =vars(risk_dist_transl))
```

In the following, we will calculate the **mode** of each density plot (that is, where the polarization measures converges or has a high probability that it ends up on X, **if we were to sample it fewer times**). We can then plot these findings, and should look at the point where the **modes** converges towards the "true score" (in our case, the sample size of 5000).

```{r, fig.width=14}
combined_result_measures <- combined_result_measures %>% 
   mutate(grouped_distXsample = paste(risk_dist_transl, fsample_size, sep = "_"))

splitted <- split(combined_result_measures$polarization_result, f = combined_result_measures$grouped_distXsample)
splitted2 <- split(combined_result_measures$BC_result, f = combined_result_measures$grouped_distXsample)

mode_dens_res <- data.frame()

for (i in 1:length(splitted)) {

   dens_res <- density(splitted[[i]])
   dens_res2 <- density(splitted2[[i]])
   
   dens_df <- data.frame(group = names(splitted)[i],
                         mode_dens_pol = dens_res$x[which.max(dens_res$y)],
                         mode_dens_BC = dens_res2$x[which.max(dens_res2$y)])
   
   mode_dens_res <- rbind(mode_dens_res, dens_df)
   
}

cleaned_mode_dens_res <- mode_dens_res %>% 
   separate(group, sep = "_", into = c("Distribution", "fsample_size")) %>% 
   mutate(Sample_Size = as.integer(fsample_size)) %>% 
   pivot_longer(cols = starts_with("mode_dens"),
                names_prefix = "mode_dens_",
                names_to = "measure",
                values_to = "value")
   

cleaned_mode_dens_res %>% 
   ggplot(aes(Sample_Size, value, group = Distribution, col = Distribution))+
   geom_line(size= 1.1)+
   facet_grid(rows = vars(measure))+
   scale_x_continuous(breaks = sample_sequence)+
   theme_minimal()+
   scale_color_manual(values = colors)

```

## Probability of Sample being in True Score Interval {.tabset .tabset-pills}

While the mode could a be good indicator, one might also argue that it is more relevant to have a good chance of capturing the true polarization value. That is, the probability that the measure is only off by only .1 (meaning it is always within a interval of + 0.05 and - 0.05) from the true score. So let's calculate the % where the ``r replications_per_setting`` simulations per sample size was within this interval. One could then interpret the results as: **the probability of coming to the same polarization value (with an interval of .1) compared to if we had sampled 5'000 people**.

### Interval of .1

```{r, fig.width= 14}
# calculate the "true score" value of said polarization measure for each population distribution

true_score <- combined_result_measures %>% 
   filter(sample_size == 5000) %>% 
   group_by(risk_dist_transl) %>% 
   summarize(mean_true_score_BC = mean(BC_result),
             mean_true_score_pol = mean(polarization_result))




combined_result_measures_within_interval <- combined_result_measures %>% 
   left_join(true_score, by = c("risk_dist_transl")) %>% 
   mutate(within_BC = if_else(BC_result >= mean_true_score_BC- lower_and_upper_interval_limit &
                               BC_result <= mean_true_score_BC + lower_and_upper_interval_limit, 1, 0),
          within_pol = if_else(polarization_result >= mean_true_score_pol- lower_and_upper_interval_limit &
                               polarization_result <= mean_true_score_pol + lower_and_upper_interval_limit, 1, 0))

summarized_within_interval <-
   combined_result_measures_within_interval %>%
   group_by(risk_dist_transl, sample_size) %>%
   summarize(count_within_BC = sum(within_BC),
      count_within_pol = sum(within_pol),
      perc_within_BC = sum(within_BC) / replications_per_setting * 100,
      perc_within_pol = sum(within_pol) / replications_per_setting * 100) %>%
   pivot_longer(cols = starts_with("perc"),
      names_to = "measure",
      values_to = "percentage",
      names_prefix = "perc_within_")

summarized_within_interval %>% ggplot(aes(factor(sample_size), percentage, col = risk_dist_transl, group = risk_dist_transl))+
   geom_point()+
   geom_line()+
   facet_wrap(~measure)+
   theme_minimal()+
   scale_y_continuous(breaks = seq(0, 100, by = 10))+
   labs(y = "percentage of simulations within 0.05 interval of true score",
        x = "sample size",
        title = "Interval of +- 0.05")+
   scale_color_manual(values = colors)+
   theme(legend.position = "none")
   

```

### Interval of .05 {.active}

As the interval of .05 (which results of a coverage of .1) was "arbitrarily" taken, let's use the same procedure for an interval of .025 (or a span of only 0.05). In this case, one would expect that we need a bigger sample size, as the margin for "error" is smaller where we would be confident enough that we have captured the true score.


```{r, fig.width=14}
combined_result_measures_within_interval <- combined_result_measures %>% 
   left_join(true_score, by = c("risk_dist_transl")) %>% 
   mutate(within_BC = if_else(BC_result >= mean_true_score_BC- lower_and_upper_interval_limit2 &
                               BC_result <= mean_true_score_BC + lower_and_upper_interval_limit2, 1, 0),
          within_pol = if_else(polarization_result >= mean_true_score_pol- lower_and_upper_interval_limit2 &
                               polarization_result <= mean_true_score_pol + lower_and_upper_interval_limit2, 1, 0))

summarized_within_interval <-
   combined_result_measures_within_interval %>%
   group_by(risk_dist_transl, sample_size) %>%
   summarize(count_within_BC = sum(within_BC),
      count_within_pol = sum(within_pol),
      perc_within_BC = sum(within_BC) / replications_per_setting * 100,
      perc_within_pol = sum(within_pol) / replications_per_setting * 100) %>%
   pivot_longer(cols = starts_with("perc"),
      names_to = "measure",
      values_to = "percentage",
      names_prefix = "perc_within_")

summarized_within_interval %>% ggplot(aes(factor(sample_size), percentage, col = risk_dist_transl, group = risk_dist_transl))+
   geom_point()+
   geom_line()+
   facet_wrap(~measure)+
   theme_minimal()+
   scale_y_continuous(breaks = seq(0, 100, by = 10))+
   labs(y = "percentage of simulations within 0.025 interval of true score",
        x = "sample size",
        title = "Interval of +- 0.025")+
   scale_color_manual(values = colors)+
   theme(legend.position = "none")

```

## Difference between two polarizations {.tabset .tabset-pills}

The approach above would only tell us how likely it is that the sampled distribution would lead to almost the same polarization value if one had sampled 5'000 people (or in a sense, a "true score", as it converges to said population distribution). In the following section, we will go one step further and analyse how likely it is to find the same difference between **two** population distributions and two sampled distributions.  

```{r}
true_score_long <- true_score %>% 
   pivot_longer(cols = starts_with("mean_true_score"),
                names_prefix = "mean_true_score_",
                names_to = "measure",
                values_to = "value")


true_score_expanded <- expand.grid(x = true_score$risk_dist_transl, y =true_score$risk_dist_transl) %>% 
   filter(x != y) %>% 
   distinct() %>% 
   left_join(true_score, by = c("x" = "risk_dist_transl")) %>% 
   left_join(true_score, by = c("y" = "risk_dist_transl"), suffix = c("_x", "_y")) %>% 
   mutate(true_diff_BC = mean_true_score_BC_x - mean_true_score_BC_y,
          true_diff_pol = mean_true_score_pol_x - mean_true_score_pol_y)


#because we have duplicate combinations (e.g. abs(x - y) == abs(y - x)), I'll sort them and take half of it (e.g. only the neccessary combinations)
n_unique <- nrow(true_score) * (nrow(true_score)-1) / 2

true_score_unique <- true_score_expanded %>% 
   arrange(desc(true_diff_BC)) %>% 
   slice_head(n = n_unique) %>% 
   select(x, y, true_diff_BC, true_diff_pol)



true_score_unique %>% 
   kable(digits = 2)
```

These are the true score differences between two distributions if one were to average the difference from sampling 5000 ratings for each population.  


In the following, we will calculate the difference between the calculated polarization measures from another distribution. Again, we will take two intervals:
.1 and .05 (that is, the sampled difference is allowed to deviate by .05 or .025 from the true score on either side).

### Interval of .1

```{r, fig.width= 14}

combined_result_measures_selected <- combined_result_measures %>% 
   select(contains("result"), sample_size, risk_dist_transl) %>% 
   group_by(risk_dist_transl, sample_size) %>% 
   mutate(sample_ID = row_number())

diff_distributions <- true_score_unique %>% 
   left_join(combined_result_measures_selected, by = c("x" = "risk_dist_transl")) %>% 
   left_join(combined_result_measures_selected, by = c("y" = "risk_dist_transl", "sample_size", "sample_ID"), suffix = c("_x", "_y")) %>% 
   mutate(diff_BC = BC_result_x - BC_result_y,
          diff_pol = polarization_result_x - polarization_result_y)

within_interval_diff_distributions <- diff_distributions %>% 
   mutate(within_BC = if_else(true_diff_BC >= diff_BC - lower_and_upper_interval_limit &
                                 true_diff_BC <= diff_BC + lower_and_upper_interval_limit, 1, 0),
          within_pol = if_else(true_diff_pol >= diff_pol - lower_and_upper_interval_limit &
                                 true_diff_pol <= diff_pol + lower_and_upper_interval_limit, 1, 0),
          comparison = paste(x, y, sep = " - ")
   ) %>% 
   group_by(comparison, sample_size) %>% 
   summarize(perc_within_BC = sum(within_BC) / replications_per_setting * 100,
             perc_within_pol = sum(within_pol) / replications_per_setting * 100) %>% 
   pivot_longer(cols = starts_with("perc"),
                names_to = "measure",
                values_to = "percentage",
                names_prefix = "perc_within_")
   

within_interval_diff_distributions %>% ggplot(aes(factor(sample_size), percentage, col = factor(comparison), group = factor(comparison)))+
   geom_point(size = 2)+
   geom_line(linewidth = 1.2)+
   facet_wrap(~measure)+
   theme_minimal()+
   scale_y_continuous(breaks = seq(0, 100, by = 10))+
   labs(y = "percentage of simulations within 0.05 interval of true score",
        x = "sample size",
        title = "Interval of +- 0.05",
        color = "Comparison between")+
   scale_color_manual(values = colors2)+
   theme(legend.spacing = unit(0.1, "cm"),
         legend.margin = margin(0,0,0,0),
         legend.box.margin = margin(0,0,0,0),
         legend.position = c(.5,.05),
         legend.direction = "horizontal")
   


```

### Interval of .05 {.active}
```{r, fig.width= 14}

within_interval_diff_distributions <- diff_distributions %>% 
   mutate(within_BC = if_else(true_diff_BC >= diff_BC - lower_and_upper_interval_limit2 &
                                 true_diff_BC <= diff_BC + lower_and_upper_interval_limit2, 1, 0),
          within_pol = if_else(true_diff_pol >= diff_pol - lower_and_upper_interval_limit2 &
                                 true_diff_pol <= diff_pol + lower_and_upper_interval_limit2, 1, 0),
          comparison = paste(x, y, sep = " - ")
   ) %>% 
   group_by(comparison, sample_size) %>% 
   summarize(perc_within_BC = sum(within_BC) / replications_per_setting * 100,
             perc_within_pol = sum(within_pol) / replications_per_setting * 100) %>% 
   pivot_longer(cols = starts_with("perc"),
                names_to = "measure",
                values_to = "percentage",
                names_prefix = "perc_within_")
   

within_interval_diff_distributions %>% ggplot(aes(factor(sample_size), percentage, col = factor(comparison), group = factor(comparison)))+
   geom_point(size = 2)+
   geom_line(linewidth = 1.2)+
   facet_wrap(~measure)+
   theme_minimal()+
   scale_y_continuous(breaks = seq(0, 100, by = 10))+
   labs(y = "percentage of simulations within 0.025 interval of true score",
        x = "sample size",
        title = "Interval of +- 0.025",
        color = "Comparison between")+
   scale_color_manual(values = colors2)+
   theme(legend.spacing = unit(0.1, "cm"),
         legend.margin = margin(0,0,0,0),
         legend.box.margin = margin(0,0,0,0),
         legend.position = c(.5,.05),
         legend.direction = "horizontal")
```

# Discussion

As one has expected, the "needed" sample size increases the stricter we set the intervals which we would be satisfied with.  

If one were to only look at the plots with the stricter interval:

- On the side of bimodality coefficient, one could argue to sample around 1250 participants so that we have a good chance of around 80% to get a good estimate for [**rare distributions**]{style="color: `r colors[3]`;"} (so that in case we do sample from a rare distribution, in 80% of the time, the true bimodality coefficient is off by only a small amount). If one wants to make sure, even sampling 2000 participants would be nice to increase the chance to 90%, but maybe that is stretching the budget a little bit...

- For the measure of polarization, estimating the true score value is hardest if one has a [**strongly polarized**]{style="color: `r colors[4]`;"} distribution. In this case, we have a 80% chance of getting it "right" if one would sample 1250 participants, or 90% for sampling 2000 respectively.

The numbers here are only suggestions, and are only based on **precision**, so that we can be sure that the results we get are reliable enough. In any case, these numbers are "only" worst case scenarios, as the other population distributions are converging earlier. We are also only looking at the stricter scenario...

When looking for the difference between two distributions, one need to sample 1500 or even 2750 people for 80% or 90% chance of being near the true score.  


### Limitations

- Discussion was solely based on the stricter interval (e.g. the last plot), not on the lenient one with a broader interval.
- As always, the simulation should be taken with a grain of salt, as these results are made with the assumption that the distribution is as "beautiful" as I made it in the introduction section... 
- Whether we can truly sample as random as we want to, or we get some subsamples is not of debate here. This work just assumes that the whole population has X distribution.
- No comparison between different sample distributions were made (e.g. overlap coefficient), so we still have no idea how big the sample size needs to be if we want to discover a difference of X between two countries/ groups **in case there is a difference**.

# Credits


## R Packages Used

- agrmt (Ruedin D (2023). _agrmt: Calculate Concentration and Dispersion in Ordered Rating Scales_. R package version 1.42.12,
<https://CRAN.R-project.org/package=agrmt>.)  

- doParallel (Corporation M, Weston S (2022). _doParallel: Foreach Parallel Adaptor for the 'parallel' Package_. R package version 1.0.17,
<https://CRAN.R-project.org/package=doParallel>.)  

- foreach (Microsoft, Weston S (2022). _foreach: Provides Foreach Looping Construct_. R package version 1.5.2, <https://CRAN.R-project.org/package=foreach>.)  

- knitr (Xie Y (2023). _knitr: A General-Purpose Package for Dynamic Report Generation in R_. R package version 1.45, <https://yihui.org/knitr/>.)  

- psych (William Revelle (2023). _psych: Procedures for Psychological, Psychometric, and Personality Research_. Northwestern University, Evanston, Illinois. R
package version 2.3.9, <https://CRAN.R-project.org/package=psych>.)  

- RColorBrewer (Neuwirth E (2022). _RColorBrewer: ColorBrewer Palettes_. R package version 1.1-3, <https://CRAN.R-project.org/package=RColorBrewer>.)  

- rmarkdown (Allaire J, Xie Y, Dervieux C, McPherson J, Luraschi J, Ushey K, Atkins A, Wickham H, Cheng J, Chang W, Iannone R (2023). _rmarkdown: Dynamic
Documents for R_. R package version 2.25, <https://github.com/rstudio/rmarkdown>.)  

- tidyverse (Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R, Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM,
Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). “Welcome to the tidyverse.” _Journal of
Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.)  

- visdat (Tierney N (2017). “visdat: Visualising Whole Data Frames.” _JOSS_, *2*(16), 355. doi:10.21105/joss.00355 <https://doi.org/10.21105/joss.00355>,
<http://dx.doi.org/10.21105/joss.00355>.)  
